{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Cluster Likert Questions" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Original survey data:\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "respondent_id", "rawType": "int64", "type": "integer" }, { "name": "q1_ease_of_use", "rawType": "object", "type": "string" }, { "name": "q2_product_quality", "rawType": "object", "type": "string" }, { "name": "q3_value_for_money", "rawType": "object", "type": "string" }, { "name": "q4_customer_service", "rawType": "object", "type": "string" }, { "name": "q5_would_recommend", "rawType": "object", "type": "string" }, { "name": "q6_meets_expectations", "rawType": "object", "type": "string" }, { "name": "q7_better_than_competitors", "rawType": "object", "type": "string" }, { "name": "q8_overall_satisfaction", "rawType": "object", "type": "string" } ], "conversionMethod": "pd.DataFrame", "ref": "1e525443-79e3-4d45-87f2-80f1812057d5", "rows": [ [ "0", "1", "Agree", "Strongly Agree", "Strongly Agree", "Agree", "Strongly Agree", "Neither Agree nor Disagree", "Neither Agree nor Disagree", "Agree" ], [ "1", "2", "Strongly Agree", "Strongly Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree", "Strongly Agree", "Agree" ], [ "2", "3", "Strongly Agree", "Neither Agree nor Disagree", "Agree", "Neither Agree nor Disagree", "Strongly Agree", "Agree", "Strongly Agree", "Strongly Agree" ], [ "3", "4", "Agree", "Agree", "Strongly Agree", "Agree", "Strongly Agree", "Strongly Agree", "Strongly Agree", "Agree" ], [ "4", "5", "Agree", "Strongly Agree", "Agree", "Agree", "Strongly Agree", "Agree", "Strongly Agree", "Agree" ] ], "shape": { "columns": 9, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
respondent_idq1_ease_of_useq2_product_qualityq3_value_for_moneyq4_customer_serviceq5_would_recommendq6_meets_expectationsq7_better_than_competitorsq8_overall_satisfaction
01AgreeStrongly AgreeStrongly AgreeAgreeStrongly AgreeNeither Agree nor DisagreeNeither Agree nor DisagreeAgree
12Strongly AgreeStrongly AgreeStrongly AgreeAgreeAgreeStrongly AgreeStrongly AgreeAgree
23Strongly AgreeNeither Agree nor DisagreeAgreeNeither Agree nor DisagreeStrongly AgreeAgreeStrongly AgreeStrongly Agree
34AgreeAgreeStrongly AgreeAgreeStrongly AgreeStrongly AgreeStrongly AgreeAgree
45AgreeStrongly AgreeAgreeAgreeStrongly AgreeAgreeStrongly AgreeAgree
\n", "
" ], "text/plain": [ " respondent_id q1_ease_of_use q2_product_quality \\\n", "0 1 Agree Strongly Agree \n", "1 2 Strongly Agree Strongly Agree \n", "2 3 Strongly Agree Neither Agree nor Disagree \n", "3 4 Agree Agree \n", "4 5 Agree Strongly Agree \n", "\n", " q3_value_for_money q4_customer_service q5_would_recommend \\\n", "0 Strongly Agree Agree Strongly Agree \n", "1 Strongly Agree Agree Agree \n", "2 Agree Neither Agree nor Disagree Strongly Agree \n", "3 Strongly Agree Agree Strongly Agree \n", "4 Agree Agree Strongly Agree \n", "\n", " q6_meets_expectations q7_better_than_competitors \\\n", "0 Neither Agree nor Disagree Neither Agree nor Disagree \n", "1 Strongly Agree Strongly Agree \n", "2 Agree Strongly Agree \n", "3 Strongly Agree Strongly Agree \n", "4 Agree Strongly Agree \n", "\n", " q8_overall_satisfaction \n", "0 Agree \n", "1 Agree \n", "2 Strongly Agree \n", "3 Agree \n", "4 Agree " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# 03_cluster_likert_questions.ipynb\n",
"import pandas as pd\n",
"import numpy as np\n",
"# NOTE(review): nlp is never referenced by name; presumably importing it registers\n",
"# the DataFrame methods (such as cluster_questions) used in the next cell - confirm.\n",
"from pandas_survey_toolkit import nlp\n",
"from pandas_survey_toolkit.vis import cluster_heatmap_plot\n",
"\n",
"# Create sample survey data with Likert scale responses.\n",
"# We simulate a product satisfaction survey with POPULATION respondents\n",
"# and 8 Likert questions.\n",
"\n",
"# Define our questions (one DataFrame column per question)\n",
"questions = [\n",
"    'q1_ease_of_use',\n",
"    'q2_product_quality',\n",
"    'q3_value_for_money',\n",
"    'q4_customer_service',\n",
"    'q5_would_recommend',\n",
"    'q6_meets_expectations',\n",
"    'q7_better_than_competitors',\n",
"    'q8_overall_satisfaction'\n",
"]\n",
"\n",
"# Define our Likert scale options, ordered from most negative to most positive.\n",
"# The slices used below rely on this ordering.\n",
"likert_options = [\n",
"    'Strongly Disagree',\n",
"    'Disagree',\n",
"    'Neither Agree nor Disagree',\n",
"    'Agree',\n",
"    'Strongly Agree'\n",
"]\n",
"\n",
"POPULATION = 200  # total number of simulated respondents\n",
"np.random.seed(42)  # fixed seed so the simulated answers are reproducible\n",
"\n",
"# range() excludes its stop value, so stop at POPULATION + 1 to get exactly\n",
"# POPULATION respondents with ids 1..POPULATION.\n",
"respondent_ids = range(1, POPULATION + 1)\n",
"data = {'respondent_id': respondent_ids}\n",
"\n",
"# Generate random Likert responses with some patterns:\n",
"#   first 30% of respondents: generally positive\n",
"#   next 30% of respondents:  generally negative\n",
"#   remaining respondents:    mostly neutral\n",
"for q in questions:\n",
"    responses = []\n",
"    for i in respondent_ids:\n",
"        if i <= (0.3 * POPULATION):  # Positive group\n",
"            responses.append(np.random.choice(likert_options[2:], p=[0.1, 0.5, 0.4]))\n",
"        elif i <= (0.6 * POPULATION):  # Negative group\n",
"            responses.append(np.random.choice(likert_options[:3], p=[0.3, 0.5, 0.2]))\n",
"        else:  # Don't care group\n",
"            responses.append(np.random.choice(likert_options[1:4], p=[0.1, 0.8, 0.1]))\n",
"    data[q] = responses\n",
"\n",
"# Create DataFrame\n",
"df = pd.DataFrame(data)\n",
"\n",
"# Sanity check: one row per simulated respondent\n",
"assert len(df) == POPULATION\n",
"\n",
"# Display the original data\n",
"print(\"Original survey data:\")\n",
"display(df.head())\n",
"\n",
"# Optional custom mapping for Likert scale values (keys are lowercase).\n",
"# The clustering call in the next cell leaves its likert_mapping argument\n",
"# commented out, so this dict is currently unused.\n",
"custom_mapping = {\n",
"    'strongly disagree': -1,\n",
"    'disagree': -1,\n",
"    'neither agree nor disagree': 0,\n",
"    'agree': 1,\n",
"    'strongly agree': 1\n",
"}\n"
] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Using default mapping:\n", "-1: Phrases containing 'disagree', 'do not agree', etc.\n", " 0: Phrases containing 'neutral', 'neither', 'unsure', etc.\n", "+1: Phrases containing 'agree' (but not 'disagree' or 'not agree')\n", "NaN: NaN values are preserved\n", " Agree -> 1: 282 times\n", " Strongly Agree -> 1: 199 times\n", " Neither Agree nor Disagree -> 0: 668 times\n", " Disagree -> -1: 293 times\n", " Strongly Disagree -> -1: 150 times\n", "\n", "Encoded Likert data:\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "respondent_id", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q1_ease_of_use", "rawType": "int64", "type": "integer" }, { 
"name": "likert_encoded_q2_product_quality", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q3_value_for_money", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q4_customer_service", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q5_would_recommend", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q6_meets_expectations", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q7_better_than_competitors", "rawType": "int64", "type": "integer" }, { "name": "likert_encoded_q8_overall_satisfaction", "rawType": "int64", "type": "integer" } ], "conversionMethod": "pd.DataFrame", "ref": "0fb30361-1b2d-4d07-b4f8-f350bffdc906", "rows": [ [ "0", "1", "1", "1", "1", "1", "1", "0", "0", "1" ], [ "1", "2", "1", "1", "1", "1", "1", "1", "1", "1" ], [ "2", "3", "1", "0", "1", "0", "1", "1", "1", "1" ], [ "3", "4", "1", "1", "1", "1", "1", "1", "1", "1" ], [ "4", "5", "1", "1", "1", "1", "1", "1", "1", "1" ] ], "shape": { "columns": 9, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
respondent_idlikert_encoded_q1_ease_of_uselikert_encoded_q2_product_qualitylikert_encoded_q3_value_for_moneylikert_encoded_q4_customer_servicelikert_encoded_q5_would_recommendlikert_encoded_q6_meets_expectationslikert_encoded_q7_better_than_competitorslikert_encoded_q8_overall_satisfaction
0111111001
1211111111
2310101111
3411111111
4511111111
\n", "
" ], "text/plain": [ " respondent_id likert_encoded_q1_ease_of_use \\\n", "0 1 1 \n", "1 2 1 \n", "2 3 1 \n", "3 4 1 \n", "4 5 1 \n", "\n", " likert_encoded_q2_product_quality likert_encoded_q3_value_for_money \\\n", "0 1 1 \n", "1 1 1 \n", "2 0 1 \n", "3 1 1 \n", "4 1 1 \n", "\n", " likert_encoded_q4_customer_service likert_encoded_q5_would_recommend \\\n", "0 1 1 \n", "1 1 1 \n", "2 0 1 \n", "3 1 1 \n", "4 1 1 \n", "\n", " likert_encoded_q6_meets_expectations \\\n", "0 0 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", "\n", " likert_encoded_q7_better_than_competitors \\\n", "0 0 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 \n", "\n", " likert_encoded_q8_overall_satisfaction \n", "0 1 \n", "1 1 \n", "2 1 \n", "3 1 \n", "4 1 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Question clustering results:\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "respondent_id", "rawType": "int64", "type": "integer" }, { "name": "question_cluster_id", "rawType": "float64", "type": "float" }, { "name": "question_cluster_probability", "rawType": "float64", "type": "float" } ], "conversionMethod": "pd.DataFrame", "ref": "254dc5ba-94ba-4684-99a5-a022533a00b8", "rows": [ [ "0", "1", "0.0", "0.5702343598374491" ], [ "1", "2", "0.0", "1.0" ], [ "2", "3", "0.0", "0.5627238816993246" ], [ "3", "4", "0.0", "1.0" ], [ "4", "5", "0.0", "1.0" ] ], "shape": { "columns": 3, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
respondent_idquestion_cluster_idquestion_cluster_probability
010.00.570234
120.01.000000
230.00.562724
340.01.000000
450.01.000000
\n", "
" ], "text/plain": [ " respondent_id question_cluster_id question_cluster_probability\n", "0 1 0.0 0.570234\n", "1 2 0.0 1.000000\n", "2 3 0.0 0.562724\n", "3 4 0.0 1.000000\n", "4 5 0.0 1.000000" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cluster heatmap showing the sentiment distribution across questions:\n" ] }, { "data": { "text/html": [ "\n", "\n", "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Cluster averages for each question:\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "question_cluster_id", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q1_ease_of_use", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q2_product_quality", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q3_value_for_money", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q4_customer_service", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q5_would_recommend", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q6_meets_expectations", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q7_better_than_competitors", "rawType": "float64", "type": "float" }, { "name": "likert_encoded_q8_overall_satisfaction", "rawType": "float64", "type": "float" } ], "conversionMethod": "pd.DataFrame", "ref": "8df84408-1730-41d7-b372-6aca72ddf886", "rows": [ [ "0.0", "0.8524590163934426", "0.8852459016393442", "0.819672131147541", "0.8688524590163934", "0.9180327868852459", "0.8524590163934426", "0.8852459016393442", "0.8852459016393442" ], [ "1.0", "0.04081632653061224", "-0.08163265306122448", "0.12244897959183673", "0.08163265306122448", "0.08163265306122448", "-0.04081632653061224", "0.1836734693877551", "0.0" ], [ "2.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "3.0", "-0.6805555555555556", "-0.6944444444444444", "-0.5694444444444444", "-0.7083333333333334", "-0.8333333333333334", "-0.6666666666666666", "-0.7777777777777778", "-0.7083333333333334" ] ], "shape": { "columns": 8, "rows": 4 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
likert_encoded_q1_ease_of_uselikert_encoded_q2_product_qualitylikert_encoded_q3_value_for_moneylikert_encoded_q4_customer_servicelikert_encoded_q5_would_recommendlikert_encoded_q6_meets_expectationslikert_encoded_q7_better_than_competitorslikert_encoded_q8_overall_satisfaction
question_cluster_id
0.00.8524590.8852460.8196720.8688520.9180330.8524590.8852460.885246
1.00.040816-0.0816330.1224490.0816330.081633-0.0408160.1836730.000000
2.00.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
3.0-0.680556-0.694444-0.569444-0.708333-0.833333-0.666667-0.777778-0.708333
\n", "
" ], "text/plain": [ " likert_encoded_q1_ease_of_use \\\n", "question_cluster_id \n", "0.0 0.852459 \n", "1.0 0.040816 \n", "2.0 0.000000 \n", "3.0 -0.680556 \n", "\n", " likert_encoded_q2_product_quality \\\n", "question_cluster_id \n", "0.0 0.885246 \n", "1.0 -0.081633 \n", "2.0 0.000000 \n", "3.0 -0.694444 \n", "\n", " likert_encoded_q3_value_for_money \\\n", "question_cluster_id \n", "0.0 0.819672 \n", "1.0 0.122449 \n", "2.0 0.000000 \n", "3.0 -0.569444 \n", "\n", " likert_encoded_q4_customer_service \\\n", "question_cluster_id \n", "0.0 0.868852 \n", "1.0 0.081633 \n", "2.0 0.000000 \n", "3.0 -0.708333 \n", "\n", " likert_encoded_q5_would_recommend \\\n", "question_cluster_id \n", "0.0 0.918033 \n", "1.0 0.081633 \n", "2.0 0.000000 \n", "3.0 -0.833333 \n", "\n", " likert_encoded_q6_meets_expectations \\\n", "question_cluster_id \n", "0.0 0.852459 \n", "1.0 -0.040816 \n", "2.0 0.000000 \n", "3.0 -0.666667 \n", "\n", " likert_encoded_q7_better_than_competitors \\\n", "question_cluster_id \n", "0.0 0.885246 \n", "1.0 0.183673 \n", "2.0 0.000000 \n", "3.0 -0.777778 \n", "\n", " likert_encoded_q8_overall_satisfaction \n", "question_cluster_id \n", "0.0 0.885246 \n", "1.0 0.000000 \n", "2.0 0.000000 \n", "3.0 -0.708333 " ] }, "metadata": {}, "output_type": "display_data" }, { "name": "stdout", "output_type": "stream", "text": [ "\n", "Number of respondents in each cluster:\n" ] }, { "data": { "text/plain": [ "question_cluster_id\n", "0.0 61\n", "1.0 49\n", "2.0 17\n", "3.0 72\n", "Name: count, dtype: int64" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [
"# Run cluster_questions over the question columns. The cells below read the\n",
"# likert_encoded_* columns and the question_cluster_id /\n",
"# question_cluster_probability columns from its result.\n",
"# likert_mapping=custom_mapping could be passed as well; default handles most cases.\n",
"df_processed = df.cluster_questions(\n",
"    columns=questions,\n",
"    umap_n_neighbors=15,\n",
"    hdbscan_min_cluster_size=15,\n",
"    cluster_selection_epsilon=0.35,\n",
")\n",
"\n",
"# Names of the encoded Likert columns, one per question\n",
"likert_columns_with_prefix = ['likert_encoded_' + q for q in questions]\n",
"\n",
"# Show the encoded answers next to the respondent id\n",
"print(\"\\nEncoded Likert data:\")\n",
"display(df_processed[['respondent_id', *likert_columns_with_prefix]].head())\n",
"\n",
"# Show the cluster id and probability columns\n",
"print(\"\\nQuestion clustering results:\")\n",
"display(\n",
"    df_processed[\n",
"        ['respondent_id', 'question_cluster_id', 'question_cluster_probability']\n",
"    ].head()\n",
")\n",
"\n",
"# Visualize cluster patterns with cluster_heatmap_plot\n",
"print(\"\\nCluster heatmap showing the sentiment distribution across questions:\")\n",
"heatmap = cluster_heatmap_plot(\n",
"    df=df_processed,\n",
"    x=\"question_cluster_id\",       # cluster ids on the x-axis\n",
"    y=likert_columns_with_prefix,  # encoded Likert columns to analyze\n",
"    max_width=30,                  # for better readability\n",
")\n",
"display(heatmap)\n",
"\n",
"# Simple interpretation aid: mean encoded answer per question within each cluster\n",
"cluster_summary = (\n",
"    df_processed\n",
"    .groupby('question_cluster_id')[likert_columns_with_prefix]\n",
"    .mean()\n",
")\n",
"print(\"\\nCluster averages for each question:\")\n",
"display(cluster_summary)\n",
"\n",
"# Respondent counts per cluster, ordered by cluster id\n",
"cluster_counts = df_processed['question_cluster_id'].value_counts().sort_index()\n",
"print(\"\\nNumber of respondents in each cluster:\")\n",
"display(cluster_counts)"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.4" } }, "nbformat": 4, "nbformat_minor": 2 }